Aqui começamos a escrever a introdução do nosso material.
Aqui continuamos
A seguir vamos colocando outros itens tipográficos
1.1.2.1.1.1.1 Subsubparágrafo
Um subsubparágrafo é aceito pelo markdown mas não é definido tipograficamente. Normalmente usamos só até o nível 6, que é o subparágrafo.
Escrevendo uma equação
$$
y = ax^2+bx+c
$$
\[ y = ax^2+bx+c \]
```{r,}
2 + 2
## [1] 4
```
```{r,}
set.seed(1)
.x <- rnorm(10000)
head(.x, 10)
## [1] -0.6264538 0.1836433 -0.8356286 1.5952808 0.3295078 -0.8204684
## [7] 0.4874291 0.7383247 0.5757814 -0.3053884
tail(.x, 10)
## [1] 1.04776175 -0.02428861 -0.47787499 -0.02971747 0.20966546 0.95950757
## [7] 0.43660362 0.49936656 0.89397983 0.25738706
```
```{r,}
hist(.x, freq = FALSE, col = "yellow", border = "black")
hist(.x, freq = FALSE, col = "lightyellow", border = "gray")
curve(dnorm(x, mean(.x), sd(.x)), xlim = c(min(.x), max(.x)),
add = TRUE, col = "#4040FFC0", lwd = 2.5)
```
```{r,}
1/2
## [1] 0.5
```
Uma vez instalado o pacote PoEdata_0.1.0.tar.gz, podemos carregá-lo normalmente:
```{r,}
library(PoEdata)
```
```{r,}
library(printr)
## Registered S3 method overwritten by 'printr':
## method from
## knit_print.data.frame rmarkdown
data(mroz)
head(mroz[, 1:5])
| taxableinc | federaltax | hsiblings | hfathereduc | hmothereduc |
|---|---|---|---|---|
| 12200 | 1494 | 1 | 14 | 16 |
| 18000 | 2615 | 8 | 7 | 3 |
| 24000 | 3957 | 4 | 7 | 10 |
| 16400 | 2279 | 6 | 7 | 12 |
| 10000 | 1063 | 3 | 7 | 7 |
| 6295 | 370 | 8 | 7 | 7 |
tail(mroz[, 1:5])
| taxableinc | federaltax | hsiblings | hfathereduc | hmothereduc | |
|---|---|---|---|---|---|
| 748 | 16100 | 1825 | 0 | 7 | 12 |
| 749 | 32000 | 4701 | 8 | 12 | 7 |
| 750 | 18500 | 2720 | 4 | 12 | 12 |
| 751 | 13000 | 1642 | 6 | 7 | 12 |
| 752 | 17200 | 2447 | 2 | 7 | 10 |
| 753 | 18700 | 2327 | 4 | 10 | 7 |
```
```{r,warning=FALSE}
library(DescTools)
# Draw a filled rectangle on the current plot with its lower-left corner at
# (xbase, ybase). By default `deltax` is scaled by the device aspect ratio
# (DescTools::Asp()) so the rectangle renders as a visual square.
plotSquare <- function(deltay = 1.5, deltax = abs(deltay/Asp()),
                       xbase = 12, ybase = 3, col = "#FF808040") {
  # corners traced counter-clockwise, repeating the first corner to close
  xs <- xbase + c(0, deltax, deltax, 0, 0)
  ys <- ybase + c(0, 0, deltay, deltay, 0)
  polygon(xs, ys, col = col)
}
# Teaching plot for simple linear regression, built up in layers.
# Draws the scatter of nota (grade) vs. horas (study hours) and, depending on
# `sequencia`, overlays: the fitted line (>1), the fitted points (>2),
# red error bars with per-point labels (>3) and squared-error squares (>4).
# Args:
#   modelo1   - an lm() fit of nota ~ horas
#   horas     - numeric vector of study hours (x axis)
#   nota      - numeric vector of grades (y axis)
#   sequencia - integer in 1..5 selecting how many layers to draw
#   sub       - expression/string used as the plot's main title
# NOTE(review): the self-referential defaults (`modelo1 = modelo1`, etc.) only
# work because callers always pass the arguments explicitly — confirm intent.
plotRegressao = function(modelo1 = modelo1, horas = horas, nota = nota,
sequencia = 5, sub = NULL) {
# draw the observed points (axes suppressed; fixed 0-14 x and 0-100 y window)
plot(horas, nota, pch = 20, col = "darkgray", xlim = c(0,
14), ylim = c(0, 100), axes = FALSE, sub = "Regressão linear simples",
xlab = "Horas de estudo", ylab = "Nota na avaliação",
main = sub)
# draw the axes crossing at the origin (0, 0)
axis(1, pos = 0)
axis(2, pos = 0)
# layer 2+: the fitted regression line in blue
if (sequencia > 1)
abline(modelo1, col = "blue")
# fitted values on the line
# NOTE(review): `horas = horas` is not a predict.lm() argument and is ignored;
# this returns fitted values for the training data, which coincide here.
estimados <- predict(modelo1, horas = horas)
# layer 3+: the fitted points on the line
if (sequencia > 2)
points(horas, estimados, pch = 20, col = "blue")
# layer 4+: error bars (y - y-hat) with labels on each point
delta = 0.3
if (sequencia > 3) {
for (i in 1:length(horas)) {
# vertical error bar from fitted value to observed value
lines(c(horas[i], horas[i]), c(estimados[i], nota[i]),
col = "red")
# short horizontal caps at both ends of the bar
lines(c(horas[i] - delta, horas[i] + delta), c(estimados[i],
estimados[i]), col = "red")
lines(c(horas[i] - delta, horas[i] + delta), c(nota[i],
nota[i]), col = "red")
# label y_i above or below the point, whichever looks cleaner
text(horas[i], nota[i], bquote(y[.(i)]), pos = ifelse(estimados[i] >
nota[i], 1, 3))
}
}
# layer 5: draw each squared error as a square whose side is the residual
if (sequencia > 4) {
for (i in 1:length(horas)) {
deltay = nota[i] - estimados[i]
deltax = abs(deltay/Asp())
if (deltay > 0)
deltax = -deltax
plotSquare(xbase = horas[i], ybase = estimados[i],
deltay = deltay, deltax = deltax)
}
}
# caption: OLS minimizes the sum of squared errors
text(8, 10, expression(min(Sigma(y[i] - bar(y[i]))^2)), cex = 1.2)
text(8, 15, "Mínimos quadrados ordinários minimiza o somatório dos quadrados dos erros",
cex = 0.7)
}
```
```{r,fig.asp=1, fig.cap="Nota na avaliação *versus* horas de estudo", fig.align='center'}
# conjuntos de dados
nota <- c(40, 30, 60, 65, 70, 90)
horas <- c(2, 4, 6, 8, 10, 12)
# função lm (linear model) -> guarda em modelo1
lm(nota ~ horas) -> modelo1
plotRegressao(modelo1, horas, nota, 1, expression("Nuvem de pontos"))
Figure 5.1: Nota na avaliação versus horas de estudo
plotRegressao(modelo1, horas, nota, 2, expression(paste("Reta de regressão: ",
nota == b[0] + b[1] * horas)))
Figure 5.2: Nota na avaliação versus horas de estudo
plotRegressao(modelo1, horas, nota, 3, expression(paste("Reta de regressão com as estimativas ",
hat(y))))
Figure 5.3: Nota na avaliação versus horas de estudo
plotRegressao(modelo1, horas, nota, 4, expression(paste("Erros observados: ",
y - hat(y))))
Figure 5.4: Nota na avaliação versus horas de estudo
plotRegressao(modelo1, horas, nota, 5, expression(paste("Quadrados dos erros: ",
(y - hat(y))^2)))
Figure 5.5: Nota na avaliação versus horas de estudo
```
```{r,}
modelo1
##
## Call:
## lm(formula = nota ~ horas)
##
## Coefficients:
## (Intercept) horas
## 21.667 5.357
modelo1$coefficients[1]
## (Intercept)
## 21.66667
modelo1$coefficients[2]
## horas
## 5.357143
```
$$
\widehat{\text{nota}} = b_0+b_1\cdot \text{horas} = 21.6666667+5.3571429\cdot \text{horas}
$$
\[ \widehat{\text{nota}} = b_0+b_1\cdot \text{horas} = 21.6666667+5.3571429\cdot \text{horas} \]
Diagnósticos do modelo
Estatística = testes de hipóteses
Uma hipótese pode ser rejeitada ou não
Existe um valor calculado para cada teste que se chama p.value (p-valor) para o qual existe um valor crítico, normalmente tomado como 0,05 (ou 5%) que chamamos de significância do teste.
Todo teste tem uma hipótese nula (\(H_0\)): se o p-valor for menor que o limite, rejeita-se \(H_0\); se for igual ou maior, não se rejeita \(H_0\).
\(H_0\) do teste F: não há relação entre as variáveis.
\(H_0\) do teste t: o coeficiente associado é igual a zero.
```{r,}
summary(nota)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| 30 | 45 | 62.5 | 59.16667 | 68.75 | 90 |
summary(horas)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| 2 | 4.5 | 7 | 7 | 9.5 | 12 |
summary(.x)
| Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|
| -3.6713 | -0.6733944 | -0.0159288 | -0.006537 | 0.6776605 | 3.810277 |
summary(modelo1)
##
## Call:
## lm(formula = nota ~ horas)
##
## Residuals:
## 1 2 3 4 5 6
## 7.6190 -13.0952 6.1905 0.4762 -5.2381 4.0476
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.667 8.221 2.636 0.0578 .
## horas 5.357 1.055 5.076 0.0071 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.83 on 4 degrees of freedom
## Multiple R-squared: 0.8656, Adjusted R-squared: 0.832
## F-statistic: 25.76 on 1 and 4 DF, p-value: 0.007102
```
\(R^2\) é a porção de variação da nota que é explicada pelas horas.
Ou seja, aproximadamente 83% da variação da nota é explicada pelas horas de estudo.
O RStudio é bastante competente para lidar com elementos gráficos e com equações, tornando-o apropriado para a escrita acadêmica de textos dessa natureza.
```{r,}
x = 1:6 * 10
y = 1:6
plot(x, y)
plotSquare(xbase = 20, ybase = 2, deltay = 0.5)
plotSquare(xbase = 30, ybase = 3, deltay = 1.5, col = "#8080FF40")
plotSquare(xbase = 40, ybase = 4, deltay = -2.5, col = "#80FF8040")
plotSquare(xbase = 50, ybase = 5, deltay = -1.5, col = "#FFFF8040")
abline(c(0, 0.1), col = "darkgray")
```
```{r,}
# Rational function with a removable singularity at x = -2:
# lim_{x -> -2} (x^3 + 8)/(x^4 - 16) = -3/8.
f <- function(x) {
  (x^3 + 8) / (x^4 - 16)
}
x = seq(-3, -1.0000001, length.out = 100)
y = f(x)
plot(x, y, type = "l")
abline(v = -2, h = -3/8, col = "lightblue")
text(-2.75, -0.4, expression(y == lim(frac(x^3 + 8, x^4 - 16),
x %->% -2)))
text(-1.75, -0.35, expression(y(-2) == "?"))
f(-2.000000001)
## [1] -0.375
-3/8
## [1] -0.375
```
Os que estudaram Cálculo talvez se lembrem da regra de L’Hôpital:
\[ \lim_{x\rightarrow-2}\dfrac{x^3+8}{x^4-16}=\lim_{x\rightarrow-2}\dfrac{\rm d(x^3+8)/\rm dx}{\rm d(x^4-16)/\rm dx}= \lim_{x\rightarrow-2}\dfrac{3x^2}{4x^3} = \dfrac{3(-2)^2}{4(-2)^3} = \dfrac{12}{-32} = -\dfrac{3}{8}=-0.375 \] Mais um exemplo:
```{r,}
# (e^x - 1)/x, with a removable singularity at x = 0 where the limit is 1.
f <- function(x) {
  (exp(x) - 1) / x
}
x = seq(-3, 1, length.out = 100)
y = f(x)
plot(x, y, type = "l")
abline(v = 0, h = 1, col = "lightblue")
text(-2.5, 1.5, expression(y == lim(frac(e^x - 1, x), x %->%
0)))
text(-2.5, 1.2, expression(y(0) == "?"))
```
```{r,}
plot.new()
plot.window(c(0, 4), c(15, 1))
text(1, 1, "universal", adj = 0)
text(2.5, 1, "\\042")
text(3, 1, expression(symbol("\"")))
text(1, 2, "existential", adj = 0)
text(2.5, 2, "\\044")
text(3, 2, expression(symbol("$")))
text(1, 3, "suchthat", adj = 0)
text(2.5, 3, "\\047")
text(3, 3, expression(symbol("'")))
text(1, 4, "therefore", adj = 0)
text(2.5, 4, "\\134")
text(3, 4, expression(symbol("\\")))
text(1, 5, "perpendicular", adj = 0)
text(2.5, 5, "\\136")
text(3, 5, expression(symbol("^")))
text(1, 6, "circlemultiply", adj = 0)
text(2.5, 6, "\\304")
text(3, 6, expression(symbol("\xc4")))
text(1, 7, "circleplus", adj = 0)
text(2.5, 7, "\\305")
text(3, 7, expression(symbol("\xc5")))
text(1, 8, "emptyset", adj = 0)
text(2.5, 8, "\\306")
text(3, 8, expression(symbol("\xc6")))
text(1, 9, "angle", adj = 0)
text(2.5, 9, "\\320")
text(3, 9, expression(symbol("\xd0")))
text(1, 10, "leftangle", adj = 0)
text(2.5, 10, "\\341")
text(3, 10, expression(symbol("\xe1")))
text(1, 11, "rightangle", adj = 0)
text(2.5, 11, "\\361")
text(3, 11, expression(symbol("\xf1")))
```
```{r,}
library(magrittr)
```
```{r,}
library(PoEdata)
data("cps_small")
plot(cps_small$educ, cps_small$wage, xlab = "education", ylab = "wage")
plot(cps_small$educ, cps_small$wage, xlab = "Educação", ylab = "Renda")
```
```{r,}
head(cps_small, 15)
| wage | educ | exper | female | black | white | midwest | south | west |
|---|---|---|---|---|---|---|---|---|
| 2.03 | 13 | 2 | 1 | 0 | 1 | 0 | 1 | 0 |
| 2.07 | 12 | 7 | 0 | 0 | 1 | 1 | 0 | 0 |
| 2.12 | 12 | 35 | 0 | 0 | 1 | 0 | 1 | 0 |
| 2.54 | 16 | 20 | 1 | 0 | 1 | 0 | 1 | 0 |
| 2.68 | 12 | 24 | 1 | 0 | 1 | 0 | 1 | 0 |
| 3.09 | 13 | 4 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3.16 | 13 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3.17 | 12 | 22 | 1 | 0 | 1 | 0 | 1 | 0 |
| 3.20 | 12 | 23 | 0 | 0 | 1 | 0 | 1 | 0 |
| 3.27 | 12 | 4 | 1 | 0 | 1 | 0 | 0 | 1 |
| 3.32 | 12 | 11 | 1 | 0 | 1 | 0 | 0 | 1 |
| 3.32 | 13 | 3 | 1 | 0 | 1 | 1 | 0 | 0 |
| 3.34 | 18 | 15 | 0 | 0 | 1 | 1 | 0 | 0 |
| 3.39 | 13 | 7 | 1 | 0 | 1 | 0 | 0 | 0 |
| 3.39 | 12 | 15 | 1 | 0 | 1 | 0 | 0 | 1 |
```
```{r,}
plot(cps_small$exper, cps_small$wage, col = "gray", pch = 20)
```
```{r,}
library(PoEdata)
data(food)
head(food)
| food_exp | income |
|---|---|
| 115.22 | 3.69 |
| 135.98 | 4.39 |
| 119.34 | 4.75 |
| 114.96 | 6.03 |
| 187.05 | 12.47 |
| 243.92 | 12.98 |
# help(food)
```
```{r,}
data("food", package = "PoEdata")
plot(food$income, food$food_exp)
# Gráfico de dispersão ou scatter plot
plot(food$income, food$food_exp, ylim = c(0, max(food$food_exp)),
xlim = c(0, max(food$income)), xlab = "weekly income in $100",
ylab = "weekly food expenditure in $", type = "p")
```
\[ food\_exp = \beta_0 + \beta_1 income +e\\[1em] \widehat{food\_exp} = \beta_0 + \beta_1 income \]
```{r,}
library(PoEdata)
# roda a regressão
mod1 <- lm(formula = food_exp ~ income, data = food)
# olha os coeficientes
mod1$coefficients
## (Intercept) income
## 83.41600 10.20964
# ou
coef(mod1)
## (Intercept) income
## 83.41600 10.20964
# um por um
mod1$coefficients[1]
## (Intercept)
## 83.416
mod1$coefficients[2]
## income
## 10.20964
# ou
(b1 <- coef(mod1)[[1]])
## [1] 83.416
(b2 <- coef(mod1)[[2]])
## [1] 10.20964
# mostra o resultado da regressão
smod1 <- summary(mod1)
smod1
##
## Call:
## lm(formula = food_exp ~ income, data = food)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.025 -50.816 -6.324 67.879 212.044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.416 43.410 1.922 0.0622 .
## income 10.210 2.093 4.877 1.95e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 89.52 on 38 degrees of freedom
## Multiple R-squared: 0.385, Adjusted R-squared: 0.3688
## F-statistic: 23.79 on 1 and 38 DF, p-value: 1.946e-05
```
```{r,}
plot(food$income, food$food_exp, xlab = "Renda semanal em $100",
ylab = "Despesa com comida em $", ylim = c(0, max(food$food_exp)),
xlim = c(0, max(food$income)), type = "p", col = "lightblue",
pch = 16, frame.plot = FALSE, axes = FALSE, main = "Despesa com comida versus renda")
axis(1, pos = 0)
axis(2, pos = 0)
# abline(b1,b2)
abline(mod1, col = "blue")
abline(h = b1, col = "darkgray", lty = 2)
```
Qual a despesa com comida de um indivíduo que ganha $2000 por semana?
\[ \widehat{food\_exp} = 83.416 + 10.210 \cdot income\\ \widehat{food\_exp} = 83.416 + 10.210 \cdot 20\\ \]
```{r,}
coef(mod1)[1] + coef(mod1)[2] * 20
## (Intercept)
## 287.6089
predict(mod1, data.frame(income = 20))
## 1
## 287.6089
```
Tecnicamente isso se chama bootstrap e trataremos depois.
Será útil mais tarde.
```{r,}
library(PoEdata)
data(br)
# testando uma relação quadrática
mod3 <- lm(formula = price ~ I(sqft^2), data = br)
# versus uma do primeiro grau
mod3.a <- lm(formula = price ~ sqft, data = br)
(summary(mod3) -> s3)
##
## Call:
## lm(formula = price ~ I(sqft^2), data = br)
##
## Residuals:
## Min 1Q Median 3Q Max
## -696604 -23366 779 21869 713159
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.578e+04 2.890e+03 19.30 <2e-16 ***
## I(sqft^2) 1.542e-02 3.131e-04 49.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68210 on 1078 degrees of freedom
## Multiple R-squared: 0.6923, Adjusted R-squared: 0.6921
## F-statistic: 2426 on 1 and 1078 DF, p-value: < 2.2e-16
(summary(mod3.a) -> s3a)
##
## Call:
## lm(formula = price ~ sqft, data = br)
##
## Residuals:
## Min 1Q Median 3Q Max
## -366641 -31399 -1535 25601 932272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -60861.462 6110.187 -9.961 <2e-16 ***
## sqft 92.747 2.411 38.476 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 79820 on 1078 degrees of freedom
## Multiple R-squared: 0.5786, Adjusted R-squared: 0.5783
## F-statistic: 1480 on 1 and 1078 DF, p-value: < 2.2e-16
# desenhando os dados com as curvas de regressão
plot(br$sqft, br$price, pch = 20, col = "gray")
x = seq(min(br$sqft), max(br$sqft), length.out = 100)
y = predict(mod3, data.frame(sqft = x))
abline(mod3.a, col = "red", lwd = 2)
lines(x, y, col = "blue", lwd = 2)
legend("topleft", legend = c(bquote(paste("Primeiro grau: ",
R[aj]^2 == .(s3a$adj.r.squared %>%
round(4)))), bquote(paste("Segundo grau: ", R[aj]^2 ==
.(s3$adj.r.squared %>%
round(4))))), cex = 0.8, fill = c("red", "blue"))
grid()
b1 <- coef(mod3)[[1]]
b2 <- coef(mod3)[[2]]
sqftx = c(2000, 4000, 6000) #given values for sqft
pricex = b1 + b2 * sqftx^2 #prices corresponding to given sqft
DpriceDsqft <- 2 * b2 * sqftx # marginal effect of sqft on price
elasticity = DpriceDsqft * sqftx/pricex
b1
## [1] 55776.57
b2
## [1] 0.0154213
DpriceDsqft
## [1] 61.68521 123.37041 185.05562
elasticity #prints results
## [1] 1.050303 1.631251 1.817408
```
```{r,}
hist(br$price)
```
Transformação de variáveis
```{r,}
hist(log(br$price))
```
```{r,}
plot(br$sqft, log(br$price))
```
Variável indicadora = dummy
\[ dummy \in \{0, 1\} \]
utown = university town
```{r,}
data(utown)
head(utown)
| price | sqft | age | utown | pool | fplace |
|---|---|---|---|---|---|
| 205.452 | 23.46 | 6 | 0 | 0 | 1 |
| 185.328 | 20.03 | 5 | 0 | 0 | 1 |
| 248.422 | 27.77 | 6 | 0 | 0 | 0 |
| 154.690 | 20.17 | 1 | 0 | 0 | 0 |
| 221.801 | 26.45 | 0 | 0 | 0 | 1 |
| 199.119 | 21.56 | 6 | 0 | 0 | 1 |
```
```{r,}
mod5 = lm(price ~ utown, data = utown)
summary(mod5)
##
## Call:
## lm(formula = price ~ utown, data = utown)
##
## Residuals:
## Min 1Q Median 3Q Max
## -85.672 -20.359 -0.462 20.646 67.955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 215.732 1.318 163.67 <2e-16 ***
## utown 61.509 1.830 33.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28.91 on 998 degrees of freedom
## Multiple R-squared: 0.5311, Adjusted R-squared: 0.5306
## F-statistic: 1130 on 1 and 998 DF, p-value: < 2.2e-16
```
Fora de utown preço = 215.732 (215.7324948)
Dentro de utown preço = 215.7324948 + 61.5091064 = 277.2416012
```{r,}
mean(utown[utown$utown == 1, "price"])
## [1] 277.2416
mean(utown[utown$utown == 0, "price"])
## [1] 215.7325
```
```{r,}
library(magrittr)
utown[utown$utown == 1, "price"] %>%
mean
## [1] 277.2416
utown[utown$utown == 0, "price"] %>%
mean
## [1] 215.7325
```
Vamos ver depois
```{r,}
# 95% confidence interval for the slope of food_exp ~ income, computed "by
# hand" from the t distribution; equivalent to confint(mod1) shown later.
library(PoEdata)
data("food")
alpha <- 0.05 # chosen significance level
mod1 <- lm(food_exp ~ income, data = food)
b2 <- coef(mod1)[[2]]
df <- df.residual(mod1) # degrees of freedom
smod1 <- summary(mod1)
seb2 <- coef(smod1)[2, 2] # se(b2): standard error of the slope
tc <- qt(1 - alpha/2, df) # two-sided critical t value
lowb <- b2 - tc * seb2 # lower bound
upb <- b2 + tc * seb2 # upper bound
c(lowb, b2, upb)
## [1] 5.972052 10.209643 14.447233
```
Tenho 1-significância = 1 - 0.05 = 0.95 = 95% de CONFIANÇA que o valor do coeficiente angular está situado entre 5.9720525 e 14.4472334.
```{r,}
confint(mod1, level = 0.95)
| 2.5 % | 97.5 % | |
|---|---|---|
| (Intercept) | -4.463279 | 171.29528 |
| income | 5.972053 | 14.44723 |
```
Reamostragem
```{r,}
# Bootstrap the slope of price ~ sqft^2 by resampling rows of `br`.
# Fix the pseudo-random number generator so results are reproducible
set.seed(1)
# Number of bootstrap replicates (500 for speed; in practice 2000 or more)
N <- 500
# Sample size = number of rows in the data
n <- nrow(br)
# Preallocate the result vector: growing it with c() inside the loop copies
# the whole vector each iteration (O(N^2)); preallocation is O(N).
valores <- numeric(N)
# resampling loop
for (i in seq_len(N)) {
  # draw n row indices with replacement (same RNG stream as sample(1:n, ...))
  idx <- sample(n, n, replace = TRUE)
  # refit the regression on the bootstrap sample
  modb <- lm(price ~ I(sqft^2), data = br[idx, ])
  # store the slope coefficient (unnamed; downstream uses only the values)
  valores[i] <- coef(modb)[[2]]
}
```
```{r,}
# desenho um histograma com uma curva normal superimposta
# Este esquema de cores é somente um exemplo, adote um
# padrão para todos os gráficos para não ficar um
# 'carnaval'
hist(valores, freq = FALSE, col = "#FFA00080", border = "white")
curve(dnorm(x, mean(valores), sd(valores)), xlim = c(min(valores),
max(valores)), add = TRUE, col = "darkgreen", lwd = 2)
c(mod3$coefficients[2], mean(valores))
## I(sqft^2)
## 0.01542130 0.01535264
# Teste de normalidade (veremos em um futuro próximo)
shapiro.test(sample(valores, min(500, length(valores))))
##
## Shapiro-Wilk normality test
##
## data: sample(valores, min(500, length(valores)))
## W = 0.99399, p-value = 0.04537
```
```{r,}
# file.choose()
library(openxlsx)
read.xlsx("/Users/jfrega/Downloads/DadosTeste.xlsx", sheet = "Planilha1",
startRow = 1) -> meusDados
plot(meusDados$x, meusDados$y)
lm(y ~ x, meusDados) -> m
m %>%
summary
##
## Call:
## lm(formula = y ~ x, data = meusDados)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9643 -1.2054 0.2679 1.1696 1.5000
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.6429 1.1198 5.932 0.00102 **
## x 2.1071 0.2218 9.502 7.75e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.437 on 6 degrees of freedom
## Multiple R-squared: 0.9377, Adjusted R-squared: 0.9273
## F-statistic: 90.29 on 1 and 6 DF, p-value: 7.745e-05
```
#````
```{r,engine="python"}
#
# ATENÇÃO: para rodar este trecho do código é necessário ter o Python instalado e configurado
#
# o comando import do Python é similar ao comando library do R
# statsmodels.formula.api é a interface para os modelos estatísticos
import statsmodels.formula.api as smf
# vou acessar os dados que foram lidos no meu código em R
r.meusDados
## x y
## 0 1.0 10.0
## 1 2.0 12.0
## 2 3.0 11.0
## 3 4.0 15.0
## 4 5.0 16.0
## 5 6.0 18.0
## 6 7.0 22.0
## 7 8.0 25.0
# rodo o modelo OLS
model = smf.ols(formula="y~x", data=r.meusDados)
# inspeciono os resultados
print(model.fit().summary())
## OLS Regression Results
## ==============================================================================
## Dep. Variable: y R-squared: 0.938
## Model: OLS Adj. R-squared: 0.927
## Method: Least Squares F-statistic: 90.29
## Date: Wed, 02 Apr 2025 Prob (F-statistic): 7.75e-05
## Time: 09:54:00 Log-Likelihood: -13.102
## No. Observations: 8 AIC: 30.20
## Df Residuals: 6 BIC: 30.36
## Df Model: 1
## Covariance Type: nonrobust
## ==============================================================================
## coef std err t P>|t| [0.025 0.975]
## ------------------------------------------------------------------------------
## Intercept 6.6429 1.120 5.932 0.001 3.903 9.383
## x 2.1071 0.222 9.502 0.000 1.565 2.650
## ==============================================================================
## Omnibus: 2.183 Durbin-Watson: 1.522
## Prob(Omnibus): 0.336 Jarque-Bera (JB): 0.848
## Skew: -0.279 Prob(JB): 0.654
## Kurtosis: 1.505 Cond. No. 11.5
## ==============================================================================
##
## Notes:
## [1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
##
## /Users/jfrega/Library/r-miniconda-arm64/lib/python3.10/site-packages/scipy/stats/_axis_nan_policy.py:418: UserWarning: `kurtosistest` p-value may be inaccurate with fewer than 20 observations; only n=8 observations were given.
## return hypotest_fun_in(*args, **kwds)
```
#````
```{r,}
library(magrittr)
```
```{r,}
library(PoEdata)
data("food")
plot(x = food$income, y = food$food_exp, xlim = c(0, max(food$income)),
ylim = c(0, max(food$food_exp)), axes = FALSE)
axis(1, pos = 0)
axis(2, pos = 0)
alpha <- 0.05
x <- 20
xbar <- mean(food$income)
ybar <- mean(food$food_exp)
m1 <- lm(formula = food_exp ~ income, data = food)
abline(m1)
points(xbar, ybar, col = "red", pch = 16, cex = 2)
abline(h = ybar, v = xbar, col = "red", lty = 2)
predict(m1, data.frame(income = 20), interval = "confidence",
level = 0.95)
| fit | lwr | upr |
|---|---|---|
| 287.6089 | 258.9069 | 316.3108 |
predict(m1, data.frame(income = 20), interval = "prediction",
level = 0.95)
| fit | lwr | upr |
|---|---|---|
| 287.6089 | 104.1323 | 471.0854 |
summary(m1) -> sm1
sm1
##
## Call:
## lm(formula = food_exp ~ income, data = food)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.025 -50.816 -6.324 67.879 212.044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.416 43.410 1.922 0.0622 .
## income 10.210 2.093 4.877 1.95e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 89.52 on 38 degrees of freedom
## Multiple R-squared: 0.385, Adjusted R-squared: 0.3688
## F-statistic: 23.79 on 1 and 38 DF, p-value: 1.946e-05
xx = seq(min(food$income), max(food$income), length.out = 50)
yy = predict(m1, data.frame(income = xx), interval = "prediction",
level = 1 - 0.05)
lines(xx, yy[, 2], col = "blue", lty = 3)
lines(xx, yy[, 3], col = "blue", lty = 3)
```
\[ \text{food_exp}= b_0+b_1\ \text{income}+e\\ \widehat{\text{food_exp}}= 83.416+10.210\ \text{income} \]
Bondade de ajuste
```{r,}
sm1
##
## Call:
## lm(formula = food_exp ~ income, data = food)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.025 -50.816 -6.324 67.879 212.044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.416 43.410 1.922 0.0622 .
## income 10.210 2.093 4.877 1.95e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 89.52 on 38 degrees of freedom
## Multiple R-squared: 0.385, Adjusted R-squared: 0.3688
## F-statistic: 23.79 on 1 and 38 DF, p-value: 1.946e-05
sm1$r.squared
## [1] 0.3850022
sm1$adj.r.squared
## [1] 0.3688181
sm1$fstatistic
## value numdf dendf
## 23.78884 1.00000 38.00000
```
```{r,}
mod2 <- lm(food_exp ~ log(income), data = food)
summary(mod2)
##
## Call:
## lm(formula = food_exp ~ log(income), data = food)
##
## Residuals:
## Min 1Q Median 3Q Max
## -215.427 -51.666 2.186 47.819 241.548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -97.19 84.24 -1.154 0.256
## log(income) 132.17 28.80 4.588 4.76e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 91.57 on 38 degrees of freedom
## Multiple R-squared: 0.3565, Adjusted R-squared: 0.3396
## F-statistic: 21.05 on 1 and 38 DF, p-value: 4.76e-05
```
```{r,}
# resíduos padronizados tem média zero e desvio-padrão um
m1$residuals %>%
scale -> padres
padres %>%
hist(freq = FALSE)
curve(dnorm(x, 0, 1), xlim = c(-3, 3), add = TRUE)
```
```{r,}
# teste de Shapiro-Wilk H0: não há desvios da normalidade
# (p > 0.05)
padres %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.98838, p-value = 0.9493
```
```{r,}
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(magrittr)
# Teste de Jarque-Bera H0: não há desvios da normalidade (p
# > 0.05)
padres %>%
jarque.bera.test()
##
## Jarque Bera Test
##
## data: .
## X-squared = 0.06334, df = 2, p-value = 0.9688
```
Testes de normalidade funcionam bem para n > 30 e n < 400 (valores empíricos e aproximados).
```{r,}
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:DescTools':
##
## Recode
padres %>%
qqPlot()
## [1] 31 38
```
```{r,}
set.seed(1)
rexp(20) %>%
qqPlot
## [1] 14 6
runif(20) %>%
qqPlot
## [1] 10 18
```
Teste RESET de Ramsey
```{r,}
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Teste RESET de Ramsey H0: forma funcional é adequada
reset(m1)
##
## RESET test
##
## data: m1
## RESET = 0.066009, df1 = 2, df2 = 36, p-value = 0.9362
```
Forma funcional adequada é que a representação da equação é funcionalmente adequada, ou seja, os termos estão nas potências e funções certas.
```{r,}
set.seed(12)
xx = seq(-3, 3, length.out = 50)
yy = 3 * xx^2 + 2 * rnorm(xx)
plot(xx, yy)
```
```{r,}
mteste = lm(yy ~ xx)
summary(mteste)
##
## Call:
## lm(formula = yy ~ xx)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.114 -6.409 -3.041 5.847 19.216
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.0815 1.1687 7.770 4.9e-10 ***
## xx 0.1050 0.6614 0.159 0.874
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.264 on 48 degrees of freedom
## Multiple R-squared: 0.0005252, Adjusted R-squared: -0.0203
## F-statistic: 0.02522 on 1 and 48 DF, p-value: 0.8745
```
```{r,}
reset(mteste)
##
## RESET test
##
## data: mteste
## RESET = 537.61, df1 = 2, df2 = 46, p-value < 2.2e-16
```
O teste RESET rejeitou a forma funcional utilizada.
```{r,}
mteste2 = lm(yy ~ I(xx^2))
reset(mteste2)
##
## RESET test
##
## data: mteste2
## RESET = 1.2401, df1 = 2, df2 = 46, p-value = 0.2988
```
Opa, agora a forma funcional é adequada, é quadrática!
```{r,}
set.seed(12)
xx = seq(-3, 3, length.out = 50)
yy = 3 * xx^3 + 4 * rnorm(xx)
plot(xx, yy)
```
```{r,}
mteste3 = lm(yy ~ I(xx^3))
reset(mteste3, power = 3)
##
## RESET test
##
## data: mteste3
## RESET = 0.0016132, df1 = 1, df2 = 47, p-value = 0.9681
summary(mteste3)
##
## Call:
## lm(formula = yy ~ I(xx^3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7571 -2.3349 -0.2141 1.9016 8.8388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.57172 0.49085 -1.165 0.25
## I(xx^3) 3.04184 0.04533 67.099 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.471 on 48 degrees of freedom
## Multiple R-squared: 0.9895, Adjusted R-squared: 0.9892
## F-statistic: 4502 on 1 and 48 DF, p-value: < 2.2e-16
```
```{r,}
plot(xx, yy)
abline((lm(yy ~ xx) -> mteste1), col = "green")
mteste2 = lm(yy ~ I(xx^2))
lines(xx, predict(mteste2, data.frame(xx = xx)), col = "red")
mteste3 = lm(yy ~ I(xx^3))
lines(xx, predict(mteste3, data.frame(xx = xx)), col = "blue")
legend("topleft", legend = c("Grau 1", "Grau 2", "Grau 3"), fill = c("green",
"red", "blue"), cex = 0.7)
```
```{r,}
mteste1 %>%
reset
##
## RESET test
##
## data: .
## RESET = 377.37, df1 = 2, df2 = 46, p-value < 2.2e-16
mteste1$residuals %>%
scale %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.96821, p-value = 0.1956
mteste2 %>%
reset
##
## RESET test
##
## data: .
## RESET = 0.011796, df1 = 2, df2 = 46, p-value = 0.9883
mteste2$residuals %>%
scale %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.94613, p-value = 0.02371
mteste3 %>%
reset
##
## RESET test
##
## data: .
## RESET = 0.46335, df1 = 2, df2 = 46, p-value = 0.6321
mteste3$residuals %>%
scale %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.96619, p-value = 0.1613
summary(mteste3)
##
## Call:
## lm(formula = yy ~ I(xx^3))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.7571 -2.3349 -0.2141 1.9016 8.8388
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.57172 0.49085 -1.165 0.25
## I(xx^3) 3.04184 0.04533 67.099 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.471 on 48 degrees of freedom
## Multiple R-squared: 0.9895, Adjusted R-squared: 0.9892
## F-statistic: 4502 on 1 and 48 DF, p-value: < 2.2e-16
```
```{r,}
plot(m1$model[, 2:1])
abline(m1)
```
```{r,}
# Teste de Breusch-Pagan H0: os dados são homoscedásticos
m1 %>%
bptest
##
## studentized Breusch-Pagan test
##
## data: .
## BP = 7.3844, df = 1, p-value = 0.006579
```
O p-valor abaixo de 0.05 rejeitou a H0, ou seja, os dados são heteroscedásticos.
```{r,}
data("newbroiler", package = "PoEdata")
mod.a <- lm(q ~ p, newbroiler)
plot(mod.a$model[, 2:1])
abline(mod.a)
mod.a$residuals %>%
scale %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.88678, p-value = 0.0001358
mod.a$residuals %>%
qqPlot(main = "Resíduos muito ruins")
## [1] 52 50
mod.a %>%
reset
##
## RESET test
##
## data: .
## RESET = 41.974, df1 = 2, df2 = 48, p-value = 2.885e-11
# mod.a é inadequado
mod6 <- lm(log(q) ~ log(p), data = newbroiler)
plot(mod6$model[, 2:1])
abline(mod6)
mod6$residuals %>%
scale %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.97287, p-value = 0.2787
mod6$residuals %>%
scale %>%
hist
mod6$residuals %>%
qqPlot(main = "Resíduos mais adequados")
## [1] 34 50
mod6 %>%
reset
##
## RESET test
##
## data: .
## RESET = 9.0626, df1 = 2, df2 = 48, p-value = 0.0004581
nrow(newbroiler)
## [1] 52
mod6 %>%
bptest
##
## studentized Breusch-Pagan test
##
## data: .
## BP = 2.5034, df = 1, p-value = 0.1136
mod6.a <- lm(log(q) ~ I(log(p)^3), data = newbroiler)
mod6.a$residuals %>%
scale %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.94107, p-value = 0.01238
mod6.a$residuals %>%
scale %>%
hist
mod6.a$residuals %>%
qqPlot()
## [1] 52 51
mod6.a %>%
reset
##
## RESET test
##
## data: .
## RESET = 32.957, df1 = 2, df2 = 48, p-value = 9.816e-10
mod.a.a <- lm(q ~ p + I(p^2) + I(p^3), newbroiler)
# aproximação de terceiro grau
mod.a.a$residuals %>%
scale %>%
shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: .
## W = 0.94371, p-value = 0.01589
mod.a.a$residuals %>%
scale %>%
hist
mod.a.a %>%
reset
##
## RESET test
##
## data: .
## RESET = 2.6918, df1 = 2, df2 = 46, p-value = 0.07843
```
```{r,}
summary(mod6)
##
## Call:
## lm(formula = log(q) ~ log(p), data = newbroiler)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.228363 -0.080077 -0.007662 0.106041 0.218679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.71694 0.02236 166.2 <2e-16 ***
## log(p) -1.12136 0.04876 -23.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.118 on 50 degrees of freedom
## Multiple R-squared: 0.9136, Adjusted R-squared: 0.9119
## F-statistic: 529 on 1 and 50 DF, p-value: < 2.2e-16
# R^2 generalizado
cor(mod6$fitted.values, log(newbroiler$q))^2
## [1] 0.9136386
```
```{r,}
# Two specifications on the untransformed variables, for comparison with
# the log-log model above.
# NOTE(review): modelo.linear.newbroiler is fitted but never summarised or
# used below — presumably kept for later comparison; confirm it is needed.
modelo.linear.newbroiler = lm(q ~ p, data = newbroiler)
modelo.quadratico.newbroiler = lm(q ~ I(p^2), data = newbroiler)
modelo.quadratico.newbroiler %>%
summary
##
## Call:
## lm(formula = q ~ I(p^2), data = newbroiler)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.128 -5.888 -3.190 6.574 15.670
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.415 1.702 24.915 < 2e-16 ***
## I(p^2) -4.650 0.549 -8.471 3.14e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.757 on 50 degrees of freedom
## Multiple R-squared: 0.5893, Adjusted R-squared: 0.5811
## F-statistic: 71.75 on 1 and 50 DF, p-value: 3.138e-11
```
Sobreajustamento ou overfitting é o fenômeno em que o modelo tem uma complexidade excessiva para explicar o fenômeno, fazendo excelentes previsões para os pontos conhecidos da amostra, mas não conseguindo lidar bem com pontos desconhecidos.
No exemplo a seguir, temos um conjunto de pontos que pode ser adequadamente aproximado por uma reta mas que foi sobreajustado por um polinômio de quarto grau e outro de nono grau.
```{r,fig.asp=1, fig.width=5}
# fixed random seed for reproducibility
set.seed(112)
# 10 x values
x = 1:10
# 10 y values: a linear trend plus Gaussian noise
y = 2 * x + 2 * rnorm(x)
# regression on a degree-9 polynomial (10 coefficients for 10 points:
# it interpolates the sample, i.e. extreme overfitting)
lm(y ~ x + I(x^2) + I(x^3) + I(x^4) + I(x^5) + I(x^6) + I(x^7) +
I(x^8) + I(x^9)) -> m9
# regression on a degree-4 polynomial
lm(y ~ x + I(x^2) + I(x^3) + I(x^4)) -> m4
# simple linear regression
lm(y ~ x) -> m1
# evaluate the polynomial curves on a grid slightly wider than the sample
xx = seq(-0.3 + min(x), max(x) + 0.2, length.out = 100)
yy = predict(m9, data.frame(x = xx))
yy4 = predict(m4, data.frame(x = xx))
# plot the points with axis limits wide enough for the polynomial
# curves' minima and maxima
plot(x, y, ylim = c(min(c(0, y, yy, yy4)), max(c(y, yy, yy4)) *
1.01), xlim = c(min(c(0, x, xx)), max(c(x, xx)) * 1.01),
xlab = expression(x), ylab = expression(y), pch = 20, col = "#C00000")
# draw the polynomial curves
lines(xx, yy, col = "red")
lines(xx, yy4, col = "darkgray")
# draw the simple regression line
abline(m1, col = "blue")
grid()
legend("topleft", legend = paste("Grau", c(1, 4, 9)), fill = c("blue",
"darkgray", "red"), cex = 0.7)
```
Percebe-se que no polinômio de grau 9 o ajuste ficará muito ruim para pontos fora da amostra, em especial aqueles que ultrapassarem os limites desta.
```{r,}
# Load magrittr for the %>% pipe operator used throughout the document.
library(magrittr)
```
Supondo que nossa amostra possui \(m\) variáveis explicativas e \(n\) casos observados
\[ \widehat{y} = b_0 + b_1x_1 + b_2x_2+\cdots+b_mx_m \]
\[ sales=\beta_0+\beta_1price+\beta_2advert+e \]
\(H_1: \beta_1\) deve ser negativo
\(H_2: \beta_2\) deve ser positivo
Freund, J. E., & Simon, G. A. 2000. Estatística aplicada: Economia, administração e contabilidade (9th ed.). Porto Alegre: Bookman.
minhabiblioteca.ufpr.br
```{r,}
# Big Andy's Burger Barn data: sales, price and advertising expenditure.
library(PoEdata)
data(andy)
head(andy)
| sales | price | advert |
|---|---|---|
| 73.2 | 5.69 | 1.3 |
| 71.8 | 6.49 | 2.9 |
| 62.4 | 5.63 | 0.8 |
| 67.4 | 6.22 | 0.7 |
| 89.3 | 5.02 | 1.5 |
| 70.3 | 6.41 | 1.3 |
# Quick visual check of sales against each candidate regressor.
plot(andy$price, andy$sales)
plot(andy$advert, andy$sales, ylim = c(min(andy$sales), max(andy$sales)),
pch = 20, col = "darkgray")
```
```{r,}
library(xtable)
# mod1: multiple regression of sales on price and advert.
mod1 <- lm(sales ~ price + advert, data = andy)
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.3
# Coefficient table rendered via xtable -> data.frame -> kable.
kable(summary(mod1) %>%
xtable %>%
data.frame)
| Estimate | Std..Error | t.value | Pr…t.. | |
|---|---|---|---|---|
| (Intercept) | 118.913610 | 6.3516375 | 18.721725 | 0.0000000 |
| price | -7.907854 | 1.0959930 | -7.215242 | 0.0000000 |
| advert | 1.862584 | 0.6831955 | 2.726283 | 0.0080382 |
summary(mod1)
##
## Call:
## lm(formula = sales ~ price + advert, data = andy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.4825 -3.1434 -0.3456 2.8754 11.3049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 118.9136 6.3516 18.722 < 2e-16 ***
## price -7.9079 1.0960 -7.215 4.42e-10 ***
## advert 1.8626 0.6832 2.726 0.00804 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.886 on 72 degrees of freedom
## Multiple R-squared: 0.4483, Adjusted R-squared: 0.4329
## F-statistic: 29.25 on 2 and 72 DF, p-value: 5.041e-10
```
```{r,}
library(lmtest)
# Test 1: residual normality - all is well if we fail to
# reject the H0 of normality
shapiro.test(mod1$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod1$residuals
## W = 0.98797, p-value = 0.7028
# Test 2: functional form (Ramsey RESET test) - if H0 is
# accepted the functional form is adequate
reset(mod1)
##
## RESET test
##
## data: mod1
## RESET = 0.53369, df1 = 2, df2 = 70, p-value = 0.5888
# Test 3: heteroskedasticity - if H0 is not rejected the
# residuals are homoskedastic
bptest(mod1)
##
## studentized Breusch-Pagan test
##
## data: mod1
## BP = 2.5722, df = 2, p-value = 0.2763
```
```{r,}
library(effects)
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
# Partial-effect plot of price in mod1, holding the other regressor fixed.
effprice <- effect("price", mod1)
plot(effprice)
```
```{r,}
# Effect plots for every term of mod1 at once.
alleffandy <- allEffects(mod1)
plot(alleffandy)
```
```{r,}
# mod2 adds a quadratic term in advert, allowing diminishing returns
# to advertising expenditure.
mod2 <- lm(sales ~ price + advert + I(advert^2), data = andy)
summary(mod2)
##
## Call:
## lm(formula = sales ~ price + advert + I(advert^2), data = andy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.2553 -3.1430 -0.0117 2.8513 11.8050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 109.7190 6.7990 16.137 < 2e-16 ***
## price -7.6400 1.0459 -7.304 3.24e-10 ***
## advert 12.1512 3.5562 3.417 0.00105 **
## I(advert^2) -2.7680 0.9406 -2.943 0.00439 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.645 on 71 degrees of freedom
## Multiple R-squared: 0.5082, Adjusted R-squared: 0.4875
## F-statistic: 24.46 on 3 and 71 DF, p-value: 5.6e-11
```
```{r,}
library(lmtest)
# Test 1: residual normality - all is well if we fail to
# reject the H0 of normality
shapiro.test(mod2$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod2$residuals
## W = 0.98775, p-value = 0.6894
# Test 2: functional form (Ramsey RESET test) - if H0 is
# accepted the functional form is adequate
reset(mod2)
##
## RESET test
##
## data: mod2
## RESET = 2.7695, df1 = 2, df2 = 69, p-value = 0.06967
# Test 3: heteroskedasticity - if H0 is not rejected the
# residuals are homoskedastic
# NOTE(review): this chunk diagnoses mod2, yet bptest is applied to mod1
# (the echoed output below is indeed mod1's). Likely a copy-paste slip —
# confirm whether bptest(mod2) was intended.
bptest(mod1)
##
## studentized Breusch-Pagan test
##
## data: mod1
## BP = 2.5722, df = 2, p-value = 0.2763
```
```{r,}
# Effect plots for every term of the quadratic model mod2.
alleffandy <- allEffects(mod2)
plot(alleffandy)
```
```{r,}
# Scatter of sales vs advert with the fitted curves of mod1 (red, linear)
# and mod2 (blue, quadratic), both evaluated at the mean price.
plot(andy$advert, andy$sales, ylim = c(min(andy$sales), max(andy$sales)),
pch = 20, col = "darkgray")
adv = seq(min(andy$advert), max(andy$advert), length.out = 100)
y = predict(mod1, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "red", lwd = 2)
y = predict(mod2, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "blue", lwd = 2)
```
```{r,}
# Standard lm diagnostic plots (residuals, QQ, leverage) for mod1.
plot(mod1)
```
```{r,}
# Standard lm diagnostic plots (residuals, QQ, leverage) for mod2.
plot(mod2)
```
Matrizes de covariância
```{r,}
# Estimated covariance matrices of the coefficient vectors of both models.
vcov(mod1)
| (Intercept) | price | advert | |
|---|---|---|---|
| (Intercept) | 40.3432990 | -6.7950641 | -0.7484206 |
| price | -6.7950641 | 1.2012007 | -0.0197422 |
| advert | -0.7484206 | -0.0197422 | 0.4667561 |
vcov(mod2)
| (Intercept) | price | advert | I(advert^2) | |
|---|---|---|---|---|
| (Intercept) | 46.227019 | -6.4261130 | -11.6009601 | 2.9390263 |
| price | -6.426113 | 1.0939881 | 0.3004062 | -0.0856191 |
| advert | -11.600960 | 0.3004062 | 12.6463020 | -3.2887457 |
| I(advert^2) | 2.939026 | -0.0856191 | -3.2887457 | 0.8847736 |
# Sample covariance matrix of the raw variables, for comparison.
cbind(andy$sales, andy$price, andy$advert, andy$advert^2) %>%
var
| 42.101106 | -2.1042341 | 1.1984270 | 3.3387205 |
| -2.104234 | 0.2687718 | 0.0113681 | 0.0682647 |
| 1.198427 | 0.0113681 | 0.6916865 | 2.5721319 |
| 3.338720 | 0.0682647 | 2.5721319 | 9.8969231 |
```
Conseguimos comprovar as nossas hipóteses?
Conseguimos controlar as hipóteses alternativas?
\(R^2_{aj}(mod1)=0.4329316\) e \(R^2_{aj}(mod2)=0.4874564\)
```{r,}
```
```{r,}
# Two-sided 5% critical values: the standard normal quantile versus
# Student-t quantiles for decreasing degrees of freedom (t -> 1.96 as
# the sample size grows).
qnorm(1 - 0.05/2) %>%
round(4)
## [1] 1.96
qt(1 - 0.05/2, 5000 - 2)
## [1] 1.960439
qt(1 - 0.05/2, 500 - 2)
## [1] 1.964739
qt(1 - 0.05/2, 200 - 2)
## [1] 1.972017
qt(1 - 0.05/2, 120 - 2)
## [1] 1.980272
qt(1 - 0.05/2, 30 - 2)
## [1] 2.048407
```
```{r,}
# Print mod2's coefficients; they feed the fitted equation shown below.
mod2
##
## Call:
## lm(formula = sales ~ price + advert + I(advert^2), data = andy)
##
## Coefficients:
## (Intercept) price advert I(advert^2)
## 109.719 -7.640 12.151 -2.768
```
$$ \widehat{sales} = 109.719 - 7.640\,price + 12.151\,advert - 2.768\,advert^2
$$
Qual o valor máximo a ser investido em propaganda?
Fica perto do ponto onde a parábola atinge o seu máximo.
Ponto de máximo:
\[ advert = \dfrac{-b}{2a} = \dfrac{-12.151}{2(-2.768)} = \dfrac{12.151}{5.536} \approx 2.194906 \]
```{r,}
# Advertising level that maximizes the fitted parabola: -b/(2a).
12.151/(2 * 2.768)
## [1] 2.194906
```
```{r,}
# Same fitted-curve plot as before, now annotating the maximum of the
# quadratic (mod2) curve, evaluated at the mean price.
plot(andy$advert, andy$sales, ylim = c(min(andy$sales), max(andy$sales)),
pch = 20, col = "darkgray")
adv = seq(min(andy$advert), max(andy$advert), length.out = 100)
y = predict(mod1, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "red", lwd = 2)
y = predict(mod2, data.frame(advert = adv, price = mean(andy$price)))
lines(adv, y, col = "blue", lwd = 2)
# vertex of the parabola (-b/2a) and the predicted sales at that point
xmax = 12.151/(2 * 2.768)
ymax = predict(mod2, data.frame(price = mean(andy$price), advert = xmax))
abline(v = xmax, h = ymax, lty = 2, col = "blue")
points(xmax, ymax, col = "blue")
text(xmax, ymax, paste("(", xmax %>%
round(2), ", ", ymax %>%
round(2), ")", sep = ""), pos = 3, col = "blue")
grid()
mean(andy$price)
## [1] 5.6872
```
```{r,}
# pizza4: annual pizza expenditure data (40 individuals, 7 variables).
data("pizza4", package = "PoEdata")
head(pizza4)
| pizza | female | hs | college | grad | income | age |
|---|---|---|---|---|---|---|
| 109 | 1 | 0 | 0 | 0 | 19.5 | 25 |
| 0 | 1 | 0 | 0 | 0 | 39.0 | 45 |
| 0 | 1 | 0 | 0 | 0 | 15.6 | 20 |
| 108 | 1 | 0 | 0 | 0 | 26.0 | 28 |
| 220 | 1 | 1 | 0 | 0 | 19.5 | 25 |
| 189 | 1 | 1 | 0 | 0 | 39.0 | 35 |
# Opens the dataset's help page; the lines below are its rendered text.
`?`(pizza4)
| pizza4 | R Documentation |
Obs: 40 individuals
data("pizza4")
A data frame with 40 observations on the following 7 variables.
pizzaannual pizza expenditure, $
female=1 if female
hs=1 if highest degree received is high school diploma
college=1 if highest degree received is a college diploma
grad=1 if highest degree received is a post graduate degree
incomeannual income in thousands of dollars
ageage in years
http://principlesofeconometrics.com/poe4/poe4.htm
data(pizza4)
## maybe str(pizza4) ; plot(pizza4) ...
```
```{r,}
# Simple regression: pizza expenditure on income alone.
mod3 = lm(pizza ~ income, data = pizza4)
mod3 %>%
summary
##
## Call:
## lm(formula = pizza ~ income, data = pizza4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -260.17 -103.81 -49.86 122.59 337.12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 128.9803 34.5913 3.729 0.000626 ***
## income 1.1213 0.4595 2.440 0.019461 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 146.8 on 38 degrees of freedom
## Multiple R-squared: 0.1355, Adjusted R-squared: 0.1127
## F-statistic: 5.954 on 1 and 38 DF, p-value: 0.01946
# Simple regression on age alone: not significant by itself.
mod3 = lm(pizza ~ age, data = pizza4)
mod3 %>%
summary
##
## Call:
## lm(formula = pizza ~ age, data = pizza4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -235.90 -131.28 -18.39 107.80 393.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 301.728 84.206 3.583 0.000951 ***
## age -3.291 2.408 -1.367 0.179671
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 154.2 on 38 degrees of freedom
## Multiple R-squared: 0.04687, Adjusted R-squared: 0.02179
## F-statistic: 1.869 on 1 and 38 DF, p-value: 0.1797
# Both regressors together: each term becomes significant.
mod3 = lm(pizza ~ age + income, data = pizza4)
mod3 %>%
summary
##
## Call:
## lm(formula = pizza ~ age + income, data = pizza4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -219.96 -116.36 27.85 88.02 287.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 342.8848 72.3434 4.740 3.14e-05 ***
## age -7.5756 2.3170 -3.270 0.002333 **
## income 1.8325 0.4643 3.947 0.000341 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 131.1 on 37 degrees of freedom
## Multiple R-squared: 0.3293, Adjusted R-squared: 0.293
## F-statistic: 9.081 on 2 and 37 DF, p-value: 0.0006185
# Adding the age:income interaction term.
mod3 = lm(pizza ~ age + income + age:income, data = pizza4)
mod3 %>%
summary
##
## Call:
## lm(formula = pizza ~ age + income + age:income, data = pizza4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -200.86 -83.82 20.70 85.04 254.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 161.46543 120.66341 1.338 0.1892
## age -2.97742 3.35210 -0.888 0.3803
## income 6.97991 2.82277 2.473 0.0183 *
## age:income -0.12324 0.06672 -1.847 0.0730 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 127 on 36 degrees of freedom
## Multiple R-squared: 0.3873, Adjusted R-squared: 0.3363
## F-statistic: 7.586 on 3 and 36 DF, p-value: 0.0004681
# income:age denotes the same interaction — note the identical fit below.
mod3 = lm(pizza ~ age + income + income:age, data = pizza4)
mod3 %>%
summary
##
## Call:
## lm(formula = pizza ~ age + income + income:age, data = pizza4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -200.86 -83.82 20.70 85.04 254.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 161.46543 120.66341 1.338 0.1892
## age -2.97742 3.35210 -0.888 0.3803
## income 6.97991 2.82277 2.473 0.0183 *
## age:income -0.12324 0.06672 -1.847 0.0730 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 127 on 36 degrees of freedom
## Multiple R-squared: 0.3873, Adjusted R-squared: 0.3363
## F-statistic: 7.586 on 3 and 36 DF, p-value: 0.0004681
# age * income expands to age + income + age:income — identical fit again.
mod3 = lm(pizza ~ age * income, data = pizza4)
mod3 %>%
summary
##
## Call:
## lm(formula = pizza ~ age * income, data = pizza4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -200.86 -83.82 20.70 85.04 254.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 161.46543 120.66341 1.338 0.1892
## age -2.97742 3.35210 -0.888 0.3803
## income 6.97991 2.82277 2.473 0.0183 *
## age:income -0.12324 0.06672 -1.847 0.0730 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 127 on 36 degrees of freedom
## Multiple R-squared: 0.3873, Adjusted R-squared: 0.3363
## F-statistic: 7.586 on 3 and 36 DF, p-value: 0.0004681
```
```{r,}
# cps4_small: wage data (hourly wage, education, experience and dummies).
data(cps4_small)
head(cps4_small)
| wage | educ | exper | hrswk | married | female | metro | midwest | south | west | black | asian |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 18.70 | 16 | 39 | 37 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 |
| 11.50 | 12 | 16 | 62 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 15.04 | 16 | 13 | 40 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 |
| 25.95 | 14 | 11 | 40 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 |
| 24.03 | 12 | 51 | 40 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 20.00 | 12 | 30 | 40 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
```
```{r,}
# Remember the mhist function defined in the file
# 'Pequenos Experimentos.Rmd'? (take a look there) I created
# a file containing that function (mhist.R); to make it
# available here, just source the file straight
# from GitHub
source("https://raw.githubusercontent.com/jrfrega/2025/refs/heads/main/mhist.R")
# wage shows many upper outliers
mhist(cps4_small$wage)
boxplot(cps4_small$wage)
# the log transform reduces the problem to a few
# lower outliers
mhist(cps4_small$wage %>%
log)
boxplot(cps4_small$wage %>%
log)
# Remember that any model must be supported by adequate theory;
# here we propose an interaction between educ and exper and a
# diminishing return to exper (the quadratic term)
mod4 = lm(log(wage) ~ educ * exper + I(exper^2), data = cps4_small)
summary(mod4)
##
## Call:
## lm(formula = log(wage) ~ educ * exper + I(exper^2), data = cps4_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.28227 -0.32856 -0.02725 0.33751 1.47088
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.297e-01 2.267e-01 2.336 0.01969 *
## educ 1.272e-01 1.472e-02 8.642 < 2e-16 ***
## exper 6.298e-02 9.536e-03 6.604 6.48e-11 ***
## I(exper^2) -7.139e-04 8.804e-05 -8.109 1.49e-15 ***
## educ:exper -1.322e-03 4.949e-04 -2.672 0.00766 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5057 on 995 degrees of freedom
## Multiple R-squared: 0.2445, Adjusted R-squared: 0.2415
## F-statistic: 80.52 on 4 and 995 DF, p-value: < 2.2e-16
# Raw scatter plots of wage against each main regressor.
with(cps4_small, plot(educ, wage))
with(cps4_small, plot(exper, wage))
```
Necessidade de indicadores mais robustos que o \(R^2\) para determinar a qualidade de uma regressão.
Modernamente, usa-se os critérios de informação
AIC (Akaike Information Criterion)
BIC (Bayesian Information Criterion)
```{r,}
# Compare mod1 and mod2 via one-row model summaries (broom::glance):
# AIC/BIC penalize model complexity, unlike R^2.
library(broom)
mod1
##
## Call:
## lm(formula = sales ~ price + advert, data = andy)
##
## Coefficients:
## (Intercept) price advert
## 118.914 -7.908 1.863
mod2
##
## Call:
## lm(formula = sales ~ price + advert + I(advert^2), data = andy)
##
## Coefficients:
## (Intercept) price advert I(advert^2)
## 109.719 -7.640 12.151 -2.768
rbind(glance(mod1), glance(mod2))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.4482578 | 0.4329316 | 4.886124 | 29.24786 | 0 | 2 | -223.8695 | 455.739 | 465.0090 | 1718.943 | 72 | 75 |
| 0.5082352 | 0.4874564 | 4.645283 | 24.45932 | 0 | 3 | -219.5540 | 449.108 | 460.6954 | 1532.084 | 71 | 75 |
```
O melhor modelo é o que apresenta o menor valor para um dado critério de informação.
Com base no AIC, o modelo 1 (mod1) é pior que o modelo 2 (mod2) (455.739 > 449.108)
Os critérios de informação balanceiam a parcimônia do modelo com a qualidade do ajuste.
```{r,}
# Recover mod1's formula, then fit two richer specifications and compare
# all four models by adjusted R^2, AIC and BIC.
mod1$call
## lm(formula = sales ~ price + advert, data = andy)
# mod2.1: quadratic in both price and advert.
mod2.1 = lm(formula = sales ~ price + advert + I(advert^2) +
I(price^2), data = andy)
rbind(glance(mod1), glance(mod2), glance(mod2.1))
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.4482578 | 0.4329316 | 4.886124 | 29.24786 | 0 | 2 | -223.8695 | 455.7390 | 465.0090 | 1718.943 | 72 | 75 |
| 0.5082352 | 0.4874564 | 4.645283 | 24.45932 | 0 | 3 | -219.5540 | 449.1080 | 460.6954 | 1532.084 | 71 | 75 |
| 0.5279827 | 0.5010102 | 4.583451 | 19.57491 | 0 | 4 | -218.0171 | 448.0341 | 461.9391 | 1470.561 | 70 | 75 |
# mod2.2: cubic in advert, quadratic in price.
mod2.2 = lm(formula = sales ~ price + advert + I(advert^2) +
I(advert^3) + I(price^2), data = andy)
rbind(glance(mod1), glance(mod2), glance(mod2.1), glance(mod2.2)) %>%
as.data.frame -> df
# Label the rows with descriptive model names for the comparison table.
# (fixed: the mod2.2 label was missing its closing parenthesis)
row.names(df) = c("mod1 (linear)", "mod2 (quadrático em advert)",
"mod2.1 (quadrático em tudo)", "mod2.2 (quadrático em price, cúbico em advert)")
df[, c("adj.r.squared", "AIC", "BIC")]
| adj.r.squared | AIC | BIC | |
|---|---|---|---|
| mod1 (linear) | 0.4329316 | 455.7390 | 465.0090 |
| mod2 (quadrático em advert) | 0.4874564 | 449.1080 | 460.6954 |
| mod2.1 (quadrático em tudo) | 0.5010102 | 448.0341 | 461.9391 |
| mod2.2 (quadrático em price, cúbico em advert) | 0.4938355 | 450.0257 | 466.2481 |
summary(mod2.1)
##
## Call:
## lm(formula = sales ~ price + advert + I(advert^2) + I(price^2),
## data = andy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.1771 -2.9378 0.0949 2.8897 11.6856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 248.8390 81.5712 3.051 0.003225 **
## price -57.0629 28.8988 -1.975 0.052262 .
## advert 13.1624 3.5582 3.699 0.000427 ***
## I(advert^2) -3.0896 0.9469 -3.263 0.001708 **
## I(price^2) 4.3364 2.5340 1.711 0.091454 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.583 on 70 degrees of freedom
## Multiple R-squared: 0.528, Adjusted R-squared: 0.501
## F-statistic: 19.57 on 4 and 70 DF, p-value: 7.553e-11
```
Pelo critério Akaike (AIC), o melhor modelo é mod2.1 (quadrático em tudo).
Pelo critério de Schwarz (BIC), o melhor modelo é mod2 (quadrático em advert).